/****************

ITA_import.do
--Imports annual ITA data and merges into one analysis file

****************/

/*When this script is executed, Stata's working directory should be set to the Project/ folder*/

clear

set more off
version 18

*Set the working directory to the Project/ folder

*Append annual tables together -- all must first be in .dta format

import delimited "Data\InputData\ITA Data CY 2017.csv", varnames(1) bindquote(strict) clear 

*Some duplicates are separate reports filed by the same workplace at different times, and thus have different injury rates reported. Default to using the most recent report, so sort observations in order of most recent report.

tempfile ITA_2017
save `ITA_2017', replace

import delimited "Data\InputData\ITA Data CY 2018.csv", varnames(1) bindquote(strict) clear 

tempfile ITA_2018
save `ITA_2018', replace

import delimited "Data\InputData\ITA Data CY 2019.csv", varnames(1) bindquote(strict) clear

tempfile ITA_2019
save `ITA_2019', replace


import delimited "Data\InputData\ITA Data CY 2020.csv", varnames(1) bindquote(strict) clear


tempfile ITA_2020
save `ITA_2020', replace


import delimited "Data\InputData\ITA Data CY 2021.csv", varnames(1) bindquote(strict) clear

tempfile ITA_2021
save `ITA_2021', replace


import delimited "Data\InputData\ITA Data CY 2022.csv", varnames(1) bindquote(strict) clear

tempfile ITA_2022
save `ITA_2022', replace


import delimited "Data\InputData\ITA Data CY 2016.csv", varnames(1) bindquote(strict)  clear


append using `ITA_2017.dta' `ITA_2018.dta'  `ITA_2019.dta' `ITA_2020.dta' `ITA_2021.dta' `ITA_2022.dta', force

*variable 'ein' has no observations, so drop
*variable 'change_reason' is not relevant to analysis, so drop

drop ein
drop change_reason

*also drop any blank observations -- according to the ITA Data Dictionary, many of these are redacted test cases, while others are null reports

drop if missing(company_name) & missing(establishment_name)

rename year_filing_for year

rename annual_average_employees ann_empl

*search for missing observations of float variables

local varlist "total_deaths total_dafw_cases total_djtr_cases total_other_cases total_dafw_days total_djtr_days total_injuries total_poisonings total_respiratory_conditions establishment_id year"

foreach x of local varlist {
	drop if `x'==.
} // 1 missing obs for total_deaths

gsort establishment_id -created_timestamp

duplicates drop establishment_id year, force

drop if state!="OH" & state!="KY"

sort establishment_id
by establishment_id: egen years_available=count(establishment_id)

tabulate years_available

drop if years_available!=7

drop if establishment_id == 232939

tabulate years_available // 17,381 observations remaining

tabulate year

gen state_dummy = 0
replace state_dummy = 1 if state=="KY"

gen time_dummy_2017 = 0
replace time_dummy = 1 if year>=2017

gen state_time = state_dummy*time_dummy

*generate categorical breaking workplaces up by sector, according to designations in union data (Hirsch, Macpherson, Even 2023)
/*
NAICS codes corresponding to groups
Public - 92
Private - Excluding 92
Construction - 23
Manufacture - 31-33
*/

*split NAICS codes up to get first two digits separate

nsplit naics_code, digits(2 4) //naics_code1 will be the variable used to create sector observations

*generate sector variable to match union data

gen sector="Private"
replace sector="Priv. Construction" if naics_code1==23
replace sector="Priv. Manufacturing" if naics_code1==31 | naics_code1==32 | naics_code1==33
replace sector="Public" if naics_code1==92

***Create dummy for whether or not the workplace is in the Cincinnati metro area -- on the border of OH and KY.

*macro for relevant zip codes -- zip codes covered by the OH and KY counties in the Cincinnati MSA, per US Census Bureau

tostring zip_code, replace

local zips1 `" "41005", "41005", "41022", "41042", "41048", "41080", "41091", "41092", "41094" "' 

local zips2 `" "41011", "41012", "41014", "41015", "41016", "41017", "41019", "41025", "41051" "'

local zips3 `" "41053", "41063", "41001", "41007", "41059", "41071", "41072", "41073", "41074" "'

local zips4 `" "41075", "41076", "41085", "41099", "41046", "41086", "41095", "41010", "41030" "'

local zips5 `" "41035", "41052", "41054", "41097", "41006", "41033", "41040", "41002", "41004" "'

local zips6 `" "41043", "41044", "41061", "45003", "45004", "45011", "45012", "45013", "45014" "'

local zips7 `" "45015", "45018", "45042", "45044", "45050", "45053", "45055", "45056", "45061" "'

local zips8 `" "45062", "45063", "45064", "45067", "45069", "45071", "45005", "45032", "45034" "'

local zips9 `" "45036", "45039", "45040", "45054", "45065", "45066", "45068", "45152", "45162" "'

local zips10 `" "45036", "45039", "45040", "45054", "45065", "45066", "45068", "45152", "45162" "'

local zips11 `" "45001", "45002", "45030", "45033", "45041", "45215", "45216", "45217", "45218" "'

local zips12 `" "45051", "45052", "45111", "45174", "45201", "45202", "45203", "45204", "45205" "'

local zips13 `" "45205", "45206", "45207", "45208", "45209", "45211", "45212", "45213", "45214" "'

local zips14 `" "45222", "45223", "45224", "45225", "45226", "45227", "45229", "45230", "45231" "'

local zips15 `" "45241", "45242", "45243", "45244", "45246", "45247", "45248", "45249", "45250" "'

local zips16 `" "45251", "45252", "45253", "45254", "45255", "45258", "56262", "45263", "45264" "'

local zips17 `" "45267", "45268", "45269", "45270", "45271", "45273", "45274", "45275", "45277" "'

local zips18 `" "45280", "45296", "45298", "45299", "45999", "45102", "45103", "45106", "45112" "'

local zips19 `" "45120", "45122", "45140", "45147", "45150", "45153", "45156", "45157", "45158" "'

local zips20 `" "45160", "45176", "45245", "45101", "45115", "45118", "45119", "45121", "45130" "'

local zips21 `" "45131", "45154", "45167", "45168", "45171", "45219", "45220", "45221" "'

gen Cin_MSA = 0

replace Cin_MSA = 1 if inlist(zip_code, `zips1')
replace Cin_MSA = 1 if inlist(zip_code, `zips2')
replace Cin_MSA = 1 if inlist(zip_code, `zips3')
replace Cin_MSA = 1 if inlist(zip_code, `zips4')
replace Cin_MSA = 1 if inlist(zip_code, `zips5')
replace Cin_MSA = 1 if inlist(zip_code, `zips6')
replace Cin_MSA = 1 if inlist(zip_code, `zips7')
replace Cin_MSA = 1 if inlist(zip_code, `zips8')
replace Cin_MSA = 1 if inlist(zip_code, `zips9')
replace Cin_MSA = 1 if inlist(zip_code, `zips10')
replace Cin_MSA = 1 if inlist(zip_code, `zips11')
replace Cin_MSA = 1 if inlist(zip_code, `zips12')
replace Cin_MSA = 1 if inlist(zip_code, `zips13')
replace Cin_MSA = 1 if inlist(zip_code, `zips14')
replace Cin_MSA = 1 if inlist(zip_code, `zips15')
replace Cin_MSA = 1 if inlist(zip_code, `zips16')
replace Cin_MSA = 1 if inlist(zip_code, `zips17')
replace Cin_MSA = 1 if inlist(zip_code, `zips18')
replace Cin_MSA = 1 if inlist(zip_code, `zips19')
replace Cin_MSA = 1 if inlist(zip_code, `zips20')
replace Cin_MSA = 1 if inlist(zip_code, `zips21')

save "Data\IntermediateData\ITA.dta", replace

